{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 聚类\n",
    "\n",
    "熟悉各种聚类算法的调用，\n",
    "并用评价指标选择合适的超参数。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "#导入必要的工具包\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import metrics\n",
    "\n",
    "from sklearn.decomposition import PCA\n",
    "import time\n",
    "\n",
    "from collections import defaultdict\n",
    "import csv\n",
    "from itertools import islice\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of uniqueUsers :3391\n",
      "number of uniqueEvents :13418\n"
     ]
    }
   ],
   "source": [
    "# 获取只在训练集train.csv和测试集test.csv中出现的活动\n",
    "# 统计训练集和测试集中共有多少不同的用户和活动（events）\n",
    "uniqueUsers = set()\n",
    "uniqueEvents = set()\n",
    "\n",
    "# 倒排表\n",
    "# 统计每个用户参加的活动   / 每个活动参加的用户\n",
    "eventsForUser = defaultdict(set)\n",
    "usersForEvent = defaultdict(set)\n",
    "\n",
    "for filename in [\"train.csv\", \"test.csv\"]:\n",
    "    with open(filename) as cf:\n",
    "        lines=csv.reader(cf)\n",
    "        for line in islice(lines, 1, None):\n",
    "            uniqueUsers.add(line[0])\n",
    "            uniqueEvents.add(line[1])\n",
    "\n",
    "n_uniqueUsers = len(uniqueUsers)\n",
    "n_uniqueEvents = len(uniqueEvents)\n",
    "\n",
    "print(\"number of uniqueUsers :%d\" % n_uniqueUsers)\n",
    "print(\"number of uniqueEvents :%d\" % n_uniqueEvents)\n",
    "\n",
    "eventInfoList = []\n",
    "names = []\n",
    "with open('events.csv') as cf:\n",
    "    lines = csv.reader(cf)\n",
    "    for row, line in enumerate(lines):\n",
    "        if len(uniqueEvents) == 0:\n",
    "            break\n",
    "        if 0 == row:\n",
    "            names = line\n",
    "        else:\n",
    "            if line[0] in uniqueEvents:\n",
    "                eventInfoList.append(line)\n",
    "                uniqueEvents.discard(line[0])\n",
    "\n",
    "event = pd.DataFrame(columns=names, data=eventInfoList)\n",
    "event.to_csv('Event.csv')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>event_id</th>\n",
       "      <th>user_id</th>\n",
       "      <th>start_time</th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>zip</th>\n",
       "      <th>country</th>\n",
       "      <th>lat</th>\n",
       "      <th>lng</th>\n",
       "      <th>...</th>\n",
       "      <th>c_92</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "      <th>c_other</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>684921758</td>\n",
       "      <td>3647864012</td>\n",
       "      <td>2012-10-31T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>244999119</td>\n",
       "      <td>3476440521</td>\n",
       "      <td>2012-11-03T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>3928440935</td>\n",
       "      <td>517514445</td>\n",
       "      <td>2012-11-05T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>2582345152</td>\n",
       "      <td>781585781</td>\n",
       "      <td>2012-10-30T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>1051165850</td>\n",
       "      <td>1016098580</td>\n",
       "      <td>2012-09-27T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 111 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0    event_id     user_id                start_time city state  \\\n",
       "0           0   684921758  3647864012  2012-10-31T00:00:00.001Z  NaN   NaN   \n",
       "1           1   244999119  3476440521  2012-11-03T00:00:00.001Z  NaN   NaN   \n",
       "2           2  3928440935   517514445  2012-11-05T00:00:00.001Z  NaN   NaN   \n",
       "3           3  2582345152   781585781  2012-10-30T00:00:00.001Z  NaN   NaN   \n",
       "4           4  1051165850  1016098580  2012-09-27T00:00:00.001Z  NaN   NaN   \n",
       "\n",
       "   zip country  lat  lng   ...     c_92  c_93  c_94  c_95  c_96  c_97  c_98  \\\n",
       "0  NaN     NaN  NaN  NaN   ...        0     1     0     0     0     0     0   \n",
       "1  NaN     NaN  NaN  NaN   ...        0     0     0     0     0     0     0   \n",
       "2  NaN     NaN  NaN  NaN   ...        0     0     0     0     0     0     0   \n",
       "3  NaN     NaN  NaN  NaN   ...        0     0     0     0     0     0     0   \n",
       "4  NaN     NaN  NaN  NaN   ...        0     0     0     0     0     0     0   \n",
       "\n",
       "   c_99  c_100  c_other  \n",
       "0     0      0        9  \n",
       "1     0      0        7  \n",
       "2     0      0       12  \n",
       "3     0      0        8  \n",
       "4     0      0        9  \n",
       "\n",
       "[5 rows x 111 columns]"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 读入数据\n",
    "# Event.csv是从events.csv中筛选得到的，只包含train.csv和test.csv中出现的event\n",
    "\n",
    "events = pd.read_csv('Event.csv')\n",
    "events.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>c_1</th>\n",
       "      <th>c_2</th>\n",
       "      <th>c_3</th>\n",
       "      <th>c_4</th>\n",
       "      <th>c_5</th>\n",
       "      <th>c_6</th>\n",
       "      <th>c_7</th>\n",
       "      <th>c_8</th>\n",
       "      <th>c_9</th>\n",
       "      <th>c_10</th>\n",
       "      <th>...</th>\n",
       "      <th>c_92</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "      <th>c_other</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 101 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   c_1  c_2  c_3  c_4  c_5  c_6  c_7  c_8  c_9  c_10   ...     c_92  c_93  \\\n",
       "0    2    0    2    0    0    0    0    0    0     0   ...        0     1   \n",
       "1    2    0    2    0    0    0    0    0    0     0   ...        0     0   \n",
       "2    0    0    0    0    0    0    0    0    0     0   ...        0     0   \n",
       "3    1    0    2    1    0    0    0    0    0     0   ...        0     0   \n",
       "4    1    1    0    0    0    0    0    2    0     0   ...        0     0   \n",
       "\n",
       "   c_94  c_95  c_96  c_97  c_98  c_99  c_100  c_other  \n",
       "0     0     0     0     0     0     0      0        9  \n",
       "1     0     0     0     0     0     0      0        7  \n",
       "2     0     0     0     0     0     0      0       12  \n",
       "3     0     0     0     0     0     0      0        8  \n",
       "4     0     0     0     0     0     0      0        9  \n",
       "\n",
       "[5 rows x 101 columns]"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 获取count特征\n",
    "countPer = events.iloc[:, 10:]\n",
    "\n",
    "countPer.head()\n",
    "#countPer.to_csv('EventCountCol.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = np.array(countPer)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 训练一个参数点（聚类数目为K）的MiniBatchKMeans模型，并在同一份数据X_train上评价聚类算法性能\n",
    "# 注意：返回值虽然命名为CH_score，但实际计算的是轮廓系数（silhouette_score）\n",
    "def K_cluster_analysis(K, X_train):\n",
    "    start = time.time()\n",
    "    \n",
    "    print(\"K-means begin with clusters: {}\".format(K));\n",
    "    \n",
    "    #K-means,在训练集上训练\n",
    "    mb_kmeans = MiniBatchKMeans(n_clusters = K)\n",
    "    mb_kmeans.fit(X_train)\n",
    "    \n",
    "    # K值的评估标准\n",
    "    #常见的方法有轮廓系数Silhouette Coefficient和Calinski-Harabasz Index\n",
    "    #这两个分数值越大则聚类效果越好\n",
    "    #CH_score = metrics.calinski_harabaz_score(X_train,mb_kmeans.predict(X_train))\n",
    "    CH_score = metrics.silhouette_score(X_train,mb_kmeans.predict(X_train))\n",
    "       \n",
    "    end = time.time()\n",
    "    print(\"CH_score: {}, time elaps:{}\".format(CH_score, int(end-start)))\n",
    "    \n",
    "    return CH_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 10\n",
      "CH_score: 0.39599071613882814, time elaps:5\n",
      "K-means begin with clusters: 20\n",
      "CH_score: 0.27738726201172254, time elaps:2\n",
      "K-means begin with clusters: 30\n",
      "CH_score: 0.2249453495268702, time elaps:2\n",
      "K-means begin with clusters: 40\n",
      "CH_score: 0.15630417373907468, time elaps:2\n",
      "K-means begin with clusters: 50\n",
      "CH_score: 0.1548537307996946, time elaps:2\n",
      "K-means begin with clusters: 60\n",
      "CH_score: 0.12632415265882668, time elaps:2\n",
      "K-means begin with clusters: 70\n",
      "CH_score: 0.10703350440659992, time elaps:2\n",
      "K-means begin with clusters: 80\n",
      "CH_score: 0.1054214589297374, time elaps:2\n",
      "K-means begin with clusters: 90\n",
      "CH_score: 0.10559986358536681, time elaps:2\n",
      "K-means begin with clusters: 100\n",
      "CH_score: 0.05643583015678696, time elaps:2\n"
     ]
    }
   ],
   "source": [
    "# 设置超参数（聚类数目K）搜索范围\n",
    "Ks = range(10,110,10)\n",
    "CH_scores = []\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K, data)\n",
    "    CH_scores.append(ch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x7fb14866ed68>]"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvqOYd8AAAH1NJREFUeJzt3Xl4VPW9x/H3l7CJUqUlWGRLFKTSSsWOKIrgVVS4WlCsGq0WaysPXmhr0SqCyy1uuFE3tFCvSnstFNDWaK9SFddrVcJ1aYGigCLBLUpL1VIQ+N4/fid1iIGZJJM5kzmf1/PMQ86WfDPP8DknZ/n+zN0REZFkaBV3ASIikj8KfRGRBFHoi4gkiEJfRCRBFPoiIgmi0BcRSRCFvohIgij0RUQSRKEvIpIgreMuoK7OnTt7WVlZ3GWIiLQoS5Ys+cDdSzOtV3ChX1ZWRlVVVdxliIi0KGa2Jpv1dHpHRCRBFPoiIgmSVeib2XAzW2FmK81s0k7WO8nM3MxSafMujrZbYWbH5qJoERFpnIzn9M2sBJgBHA1UA4vNrNLdl9VZryPwI+CFtHn9gArgq8BewGNmtq+7b83dryAiItnK5kh/ILDS3Ve7+2ZgLjCqnvWuAK4F/pk2bxQw1903ufsbwMro+4mISAyyCf1uwNq06epo3r+Y2YFAD3f/fUO3FRGR/GnyhVwzawVMB85vwvcYa2ZVZlZVU1PT1JJERGQHsgn9dUCPtOnu0bxaHYGvAU+a2ZvAIUBldDE307YAuPssd0+5e6q0NOOzBfX629/gkktgxYpGbS4ikgjZhP5ioI+ZlZtZW8KF2crahe6+wd07u3uZu5cBzwMj3b0qWq/CzNqZWTnQB3gx578FsHkzTJ8OV1/dHN9dRKQ4ZAx9d98CTAAWAsuBee6+1MymmtnIDNsuBeYBy4BHgPHNdedOly4wbhzcey+sWtUcP0FEpOUzd4+7hu2kUilvbBuGd96B8nI44wy4884cFyYiUsDMbIm7pzKtV1RP5HbtCuecA7Nnw5qsulCIiCRLUYU+wIUXghlMmxZ3JSIihafoQr9HDzj7bLjrLlj3ufuERESSrehCH2DSJNi2Da67Lu5KREQKS1GGflkZnHkmzJoF774bdzUiIoWjKEMfYPLkcO/+DTfEXYmISOEo2tDv3RtOPx3uuAPU2UFEJCja0AeYMgU2bgxP6oqISJGH/le+AqecArfdBuvXx12NiEj8ijr0IRztf/wx3HRT3JWIiMSv6EN///1h9Gi45RbYsCHuakRE4lX0oQ+h5fKGDXDrrXFXIiISr0SE/oABcPzx8LOfwUcfxV2NiEh8EhH6AJdeGi7m3n573JWIiMQnMaE/cCAce2x4WOuTT+KuRkQkHokJfYDLLoMPPoCZM+OuREQkHokK/UMPhSOPhOuvDw9tiYgkTaJCH8K5/Xff1chaIpJMiQv9oUPh8MPh2mth06a4qxERya/Ehb5ZONpftw7uvjvuakRE8iur0Dez4Wa2wsxWmtmkepaPM7M/mdnLZvasmfWL5peZ2cZo/stm9vNc/wKNMWwYHHJIGFLx00/jrkZEJH8yhr6ZlQAzgBFAP+C02lBP82t339/dDwCuA9L7Wq5y9wOi17hcFd4UtUf7a9bAr34VdzUiIvmTzZH+QGClu692983AXGBU+gru/ve0yV0Bz12JzWPECPjGN+Cqq2DLlrirERHJj2xCvxuwNm26Opq3HTMbb2arCEf6P0xbVG5mL5nZU2Z2eJOqzaHao/3Vq2HOnLirERHJj5xdyHX3Ge6+D3ARcEk0+x2gp7sPACYCvzazL9Td1szGmlmVmVXV5HGYq5EjoX9/uPJK2Lo1bz9WRCQ22YT+OqBH2nT3aN6OzAVOAHD3Te7+YfT1EmAVsG/dDdx9lrun3D1VWlqabe1NVnu0/9prMH9+3n6siEhssgn9xUAfMys3s7ZABVCZ
voKZ9UmbPA54PZpfGl0Ixsz2BvoAq3NReK6MHg39+oWj/W3b4q5GRKR5ZQx9d98CTAAWAsuBee6+1MymmtnIaLUJZrbUzF4mnMYZE80fArwazV8AjHP3ghq4sFWrMLrW0qXw29/GXY2ISPMy98K60SaVSnlVVVVef+bWreFof5dd4KWXwmkfEZGWxMyWuHsq03qJeyK3PiUlMHkyvPIKPPhg3NWIiDQfhX7k9NOhvByuuAIK7I8fEZGcUehH2rQJR/tVVbBwYdzViIg0D4V+mu98B3r2hKlTdbQvIsVJoZ+mbVuYNAn++EdYtCjuakREck+hX8d3vwt77RWO9kVEio1Cv4727eHCC+Hpp8NLRKSYKPTrcc45sOee4U4eEZFiotCvR4cOcMEF8Nhj4fy+iEixUOjvwLhx0LmzjvZFpLgo9Hdgt91g4kR4+OFw776ISDFQ6O/E+PHQqZOO9kWkeCj0d+ILX4DzzoPKSnj55birERFpOoV+Bj/8YQj/K6+MuxIRkaZT6Gewxx4h+O+7L/TcFxFpyRT6WTjvvHBh96qr4q5ERKRpFPpZ+NKX4D/+A+bOhRUr4q5GRKTxFPpZOv/80KLh6qvjrkREpPEU+lnq0iU8sHXvvbBqVdzViIg0jkK/AX7yE2jdGq65Ju5KREQaJ6vQN7PhZrbCzFaa2aR6lo8zsz+Z2ctm9qyZ9UtbdnG03QozOzaXxedb166hGdvs2bBmTdzViIg0XMbQN7MSYAYwAugHnJYe6pFfu/v+7n4AcB0wPdq2H1ABfBUYDtwefb8W68ILwQymTYu7EhGRhsvmSH8gsNLdV7v7ZmAuMCp9BXf/e9rkrkDtYIOjgLnuvsnd3wBWRt+vxerRA84+G+66C6qr465GRKRhsgn9bsDatOnqaN52zGy8ma0iHOn/sCHbtjSTJsG2bXDddXFXIiLSMDm7kOvuM9x9H+Ai4JKGbGtmY82sysyqampqclVSsykrgzPPhF/8At59N+5qRESyl03orwN6pE13j+btyFzghIZs6+6z3D3l7qnS0tIsSorf5MmweTPccEPclYiIZC+b0F8M9DGzcjNrS7gwW5m+gpn1SZs8Dng9+roSqDCzdmZWDvQBXmx62fHr3RtOPx3uuANawB8nIiJAFqHv7luACcBCYDkwz92XmtlUMxsZrTbBzJaa2cvARGBMtO1SYB6wDHgEGO/uW5vh94jFlCmwcSNMnx53JSIi2TF3z7xWHqVSKa9qQUNVVVTA738Pb74ZevSIiMTBzJa4eyrTenoit4mmTIGPP4abb467EhGRzBT6TbT//jB6NNxyC/ztb3FXIyKycwr9HLjkEtiwAW69Ne5KRER2TqGfAwMGwDe/CTfdBB99FHc1IiI7ptDPkUsvhfXr4fbb465ERGTHFPo5ctBBcOyx4WGtTz6JuxoRkfop9HPossvggw9g5sy4KxERqZ9CP4cOPRSOPDI0YtPRvogUIoV+jv30p/D++3D00fDhh3FXIyKyPYV+jg0eDPPnw5IlcPjhsHZt5m1ERPJFod8MTjoJ/vAHWLcOBg2CpUvjrkhEJFDoN5OhQ+GZZ8JgK4MHw7PPxl2RiIhCv1n17w/PPQdduoRz/A88EHdFIpJ0Cv1mVlYG//u/YQcwenQYbUtEJC4K/Tzo3BkWLQoPb40dC1OnQoF1tBaRhFDo58muu4bTO2PGwOWXw/jxsLVohpMRkZaiddwFJEmbNnD33fDlL8O118J778G990L79nFXJiJJoSP9PDODadPgZz+D+++H4cPVh19E8kehH5PzzoM5c8LdPUOGwNtvx12RiCSBQj9GFRXwP/8Db7wR+vasWBF3RSJS7BT6MRs2DJ58EjZuhMMOgxdeiLsiESlmWYW+mQ03sxVmttLMJtWzfKKZLTOzV83scTPrlbZsq5m9HL0qc1l8sfjGN8Jpnt13D106H3447opEpFhlDH0zKwFmACOAfsBpZtavzmovASl37w8sAK5LW7bR
3Q+IXiNzVHfR2WefEPxf+UoYenH27LgrEpFilM2R/kBgpbuvdvfNwFxgVPoK7v6Eu/8jmnwe6J7bMpNhzz3DqZ4jjoCzzgp9+fUQl4jkUjah3w1IbxBcHc3bke8B6Sco2ptZlZk9b2Yn1LeBmY2N1qmqqanJoqTi1bEj/P734SLvRRfBxImhaZuISC7k9OEsMzsDSAFD02b3cvd1ZrY3sMjM/uTuq9K3c/dZwCyAVCqV+GPbdu3CQ1tf/jLcdFN4iOuee6Bt27grE5GWLpvQXwf0SJvuHs3bjpkNA6YAQ919U+18d18X/bvazJ4EBgCr6m4v22vVCqZPh65dwxF/TU14mKtjx7grE5GWLJvTO4uBPmZWbmZtgQpgu7twzGwAMBMY6e7vp83vZGbtoq87A4cBy3JVfLEzgwsvDEf5TzwRzvW/917cVYlIS5Yx9N19CzABWAgsB+a5+1Izm2pmtXfjXA/sBsyvc2vmfkCVmb0CPAFMc3eFfgONGQOVlfCXv4R7+Vfp7yQRaSTzArs9JJVKeVVVVdxlFKQXXoDjjoOSknAv/4EHxl2RiBQKM1vi7qlM6+mJ3Bbk4IPDgCy77BKGY3zssbgrEpGWRqHfwvTtGx7iKi+Hf/93mDs37opEpCVR6LdAe+0FTz8NgwbBaafBzTfHXZGItBQK/RZqjz1g4cIw7u5558GkSXp6V0QyU+i3YO3bw7x5MG5cGInrrLPg00/jrkpECpmGS2zhSkrg9tvDKZ/LLgsPcc2fH8bkFRGpS0f6RcAMLr0UZs4Mp3yOOgo++CDuqkSkECn0i8jYsXDfffDKKzB4MKxZE3dFIlJoFPpF5oQT4NFHQ7uGQw+Ft96KuyIRKSQK/SI0eDA89RR88gkcfzx89FHcFYlIoVDoF6n+/cOdPcuWhXv5t26NuyIRKQQK/SJ2zDFw661hUJYLLoi7GhEpBLpls8idey6sWBEGY+nbN9zTLyLJpdBPgBtvhNdfhwkTwgDsRx8dd0UiEhed3kmAkpLQmK1fPzj5ZFi+PO6KRCQuCv2E6NgRHnwwtG447rjw5K6IJI9CP0F69YIHHoB33oETT4RNmzJvIyLFRaGfMAcfDLNnh8FYvv99deYUSRpdyE2gU06B114L/Xr69oVLLom7IhHJF4V+Qk2ZEm7lvPRS2HffsCMQkeKX1ekdMxtuZivMbKWZTapn+UQzW2Zmr5rZ42bWK23ZGDN7PXqNyWXx0nhmcOedcNhhMGZMGHRdRIpfxtA3sxJgBjAC6AecZmb96qz2EpBy9/7AAuC6aNsvApcDBwMDgcvNrFPuypemaNcOfvtb6NoVRo1SczaRJMjmSH8gsNLdV7v7ZmAuMCp9BXd/wt3/EU0+D3SPvj4WeNTd17v7X4FHgeG5KV1yobQUHnoINm5UczaRJMgm9LsBa9Omq6N5O/I94OGGbGtmY82sysyqanQDed716wcLFqg5m0gS5PSWTTM7A0gB1zdkO3ef5e4pd0+VlpbmsiTJ0tFHw223qTmbSLHL5u6ddUCPtOnu0bztmNkwYAow1N03pW17RJ1tn2xModL8xo2Dv/xFzdlEilk2R/qLgT5mVm5mbYEKoDJ9BTMbAMwERrr7+2mLFgLHmFmn6ALuMdE8KVA33hjaNEyYAH/4Q9zViEiuZQx9d98CTCCE9XJgnrsvNbOpZjYyWu16YDdgvpm9bGaV0bbrgSsIO47FwNRonhSokhKYM+ez5mzLlsVdkYjkknmBPYefSqW8qqoq7jIS7623YOBA6NAh3MOvSy0ihc3Mlrh7KtN66r0j9erZU83ZRIqRQl92SM3ZRIqPeu/ITqk5m0hxUehLRmrOJlI8dHpHMlJzNpHiodCXrNQ2Z9trr9Ccbc2auCsSkcZQ6EvWapuz/fOf8M1vqjmbSEuk0JcG2W8/mD9fzdlEWiqFvjRYenO288+PuxoRaQjdvSONMm5cuKOntjnbuefGXZGIZEOhL412ww3w+uvwgx/APvvA
McfEXZGIZKLTO9Jotc3ZvvpVNWcTaSkU+tIkHTvCgw/CLruE4RY18JlIYVPoS5P17AmVlWrOJtISKPQlJwYOhF/+Us3ZRAqdQl9y5uST4cor4b//G666Ku5qRKQ+untHcmryZDVnEylkOtKXnDKDX/wCBg9WczaRQqTQl5xTczaRwqXTO9IsOncOzdkGDYLevcNYu23bhh1C27bbf53tvMZskz6vZ88wLZJkWYW+mQ0HbgZKgDvdfVqd5UOAm4D+QIW7L0hbthX4UzT5lruPzEXhUvj22w8WLYLf/Cbcxrl5c3jVfl133ief7Hz55s1Na/C2557wk5+EFhK77pq731OkJTHPcG+dmZUArwFHA9XAYuA0d1+Wtk4Z8AXgAqCyTuh/7O67ZVtQKpXyqqqqBvwKkiRbt2beedS3/B//gHvvhcceC3+FnH8+jB8fHi4TKQZmtsTdU5nWy+ZIfyCw0t1XR994LjAK+Ffou/ub0bJtjapWJEslJeHp3112afi2Z50Fzz0HV1wBF18M118PP/5x6B20++45L1WkIGVzIbcbsDZtujqal632ZlZlZs+b2Qn1rWBmY6N1qmr0HL80o0MPhYcfhhdfDMM/Xnop9OoFl18O69fHXZ1I88vH3Tu9oj85TgduMrN96q7g7rPcPeXuqdLS0jyUJEl30EGhdcT//R8ceSRMnQplZWEQ+A8+iLs6keaTTeivA3qkTXeP5mXF3ddF/64GngQGNKA+kWY1YADcfz+8+iqMGAHXXBPC/8IL4b334q5OJPeyCf3FQB8zKzeztkAFUJnNNzezTmbWLvq6M3AYadcCRArF/vuHu4yWLoUTToAbb4Tycpg4MTSSEykWGUPf3bcAE4CFwHJgnrsvNbOpZjYSwMwOMrNq4GRgppktjTbfD6gys1eAJ4Bp6Xf9iBSa/fYLvYOWLw8tJG65JYT/D34A1dVxVyfSdBlv2cw33bIphWT16nDK5557oFUrOPtsmDQpXPwVKSTZ3rKpNgwiO7H33qGX0MqV8L3vwV13hSeMv/99WLUq7upEGk6hL5KFXr3g9ttD0J97bjgF1LdvuPf/tdfirk4kewp9kQbo3j2c53/jDfjRj2DevHAd4Nvf1hjB0jIo9EUaoWvXcIfPm2/CBRfAAw/A174WLv6++mrc1YnsmEJfpAm6dIFrrw3hP3kyPPIIfP3rMHo0vPRS3NWJfJ5CXyQHOncOQ0WuWRNaOixaBAceCCNHwuLFcVcn8hmFvkgOdeoE//mfIfyvvDIMFD9wYHja97nn4q5ORKEv0ix23z308XnzTZg2DZYsCQ3ehg0Lo4pt3Bh3hZJUCn2RZtSxI1x0Ubjb58YbQ5uH0aPDtYDTTgt9f7QDkHxS6Ivkwa67hj4+a9fCo4/C6aeHAV1OOglKS6GiAu67Lwz2ItKcFPoiedS6dTjFM3NmaOT22GNwxhnhwu+3vhX+Ajj1VFiwQDsAaR4KfZGYtG4NRx0FP/85vP02PP44nHkmPPkknHxy+AvglFNg/vwwfrBILij0RQpA69ZhMJc77gg7gEWLYMwYeOqpEPxduoQdwbx52gFI0yj0RQpMSQn827+FXj9vvw1PPBF6/DzzTDj1U1oaTgX95jfw8cdxVystjUJfpICVlMARR8CMGbBuXTj1c/bZ4f7/iorwF8BJJ8HcudoBSHYU+iItREkJDB0Kt90WBnR56qnQ7vmPfwy3f5aWhttB58yBjz6Ku1opVAp9kRaopASGDIFbbw23gT79dOjx//zz4XbQLl3gxBPh17/WDkC2p9AXaeFKSuDww8MOoLo67ADOOQdefDG0fC4t1Q5APqPhEkWK1LZtod/P/Pnhvv+334Z27WD48LAT6NYtTO/s1b59uLNICl+2wyUq9EUSYNu2cO6/dgewbl3227ZqlXnnULuDyGa99HXbtoXaCHLf/lV3Xqbppmyz995hR2jW+Pc4bjkNfTMbDtwMlAB3uvu0OsuHADcB/YEKd1+QtmwM
cEk0eaW7z97Zz1LoizSvbdvgz3+GDRtg06adv/75z8zrZLvu1q1x/+Y7N3ZsuEjepk3clTROtqGf8Q83MysBZgBHA9XAYjOrdPf0weHeAs4CLqiz7ReBy4EU4MCSaNu/ZvuLiEhutWoF/fvn/+du3fr5HcTmzeHouvYFDZvOxToQOqFefXUYA3n+/NAiu1hlc7ZuILDS3VcDmNlcYBTwr9B39zejZdvqbHss8Ki7r4+WPwoMB+Y0uXIRaVFKSqBDh/AqNFddBfvuGy6ADxoEDz0EvXvHXVXzyObunW7A2rTp6mheNpqyrYhI3owZExrg1dTAwQeHu6CKUUHcsmlmY82sysyqampq4i5HRBJqyBB44YVwm+uwYTB7p1cgW6ZsQn8d0CNtuns0LxtZbevus9w95e6p0tLSLL+1iEju9e4d7nQaMiT0PJo8OVz8LhbZhP5ioI+ZlZtZW6ACqMzy+y8EjjGzTmbWCTgmmiciUrA6dYKHHw539FxzTeh0WizjG2QMfXffAkwghPVyYJ67LzWzqWY2EsDMDjKzauBkYKaZLY22XQ9cQdhxLAam1l7UFREpZG3ahLEObrwxDGs5dGgY+Kal08NZIiIZVFaGnkadOsGDD8IBB8Rd0edle59+QVzIFREpZCNHwrPPhq8HDw7B31Ip9EVEsnDAAaGJ3X77wahRMH36Z20cWhKFvohIlrp2DeMYjB4N558P48bBp5/GXVXDKPRFRBqgQ4cwVvHFF8OsWTBiBPy1BTWWUeiLiDRQq1ahV88994QndwcNCn17WgKFvohII7XE1g0KfRGRJqht3dC5c2jd8Mtfxl3Rzin0RUSaqLZ1w+GHh6P/KVMKt3WDQl9EJAc6dYJHHgntma++Gk49tTBbNyj0RURypE0bmDkztG64777CbN2g0BcRySEzmDgRfvc7WL4cBg6EV16Ju6rPKPRFRJpBbesGdzjssMJp3aDQFxFpJoXYukGhLyLSjPbaK7RuOPHEwmjdoNAXEWlmHTrA/PmF0bpBoS8ikge1rRvuvjs8uXvoofG0blDoi4jk0VlnhdYN778fWjc880x+f75CX0Qkz2pbN3zpS3DUUflt3aDQFxGJQe/e8Pzz+W/doNAXEYlJ3dYNFRWwdWvz/sysQt/MhpvZCjNbaWaT6lnezsx+Ey1/wczKovllZrbRzF6OXj/PbfkiIi1beuuGvn2hpKR5f17rTCuYWQkwAzgaqAYWm1mluy9LW+17wF/dvbeZVQDXAqdGy1a5ewGOHS8iUhhqWzfkQzZH+gOBle6+2t03A3OBUXXWGQXMjr5eABxlZpa7MkVEJBeyCf1uwNq06epoXr3ruPsWYAPwpWhZuZm9ZGZPmdnhTaxXRESaIOPpnSZ6B+jp7h+a2TeA35nZV9397+krmdlYYCxAz549m7kkEZHkyuZIfx3QI226ezSv3nXMrDWwO/Chu29y9w8B3H0JsArYt+4PcPdZ7p5y91RpaWnDfwsREclKNqG/GOhjZuVm1haoACrrrFMJjIm+/hawyN3dzEqjC8GY2d5AH2B1bkoXEZGGynh6x923mNkEYCFQAtzl7kvNbCpQ5e6VwH8BvzKzlcB6wo4BYAgw1cw+BbYB49x9fXP8IiIikpl53M2d60ilUl5VVRV3GSIiLYqZLXH3VKb19ESuiEiCFNyRvpnVAGvirqOJOgMfxF1EAdH7sT29H5/Re7G9prwfvdw9450wBRf6xcDMqrL5Mysp9H5sT+/HZ/RebC8f74dO74iIJIhCX0QkQRT6zWNW3AUUGL0f29P78Rm9F9tr9vdD5/RFRBJER/oiIgmi0G8iM+thZk+Y2TIzW2pmP4rmf9HMHjWz16N/O8Vda76YWUnUWfWhaLo8GlxnZTTYTtu4a8wXM9vDzBaY2V/MbLmZDUr4Z+PH0f+TP5vZHDNrn6TPh5ndZWbvm9mf0+bV+3mw4JbofXnVzA7MRQ0K/abbApzv7v2AQ4DxZtYPmAQ87u59
gMej6aT4EbA8bfpa4Gfu3hv4K2HQnaS4GXjE3b8CfJ3wviTys2Fm3YAfAil3/xqhrUvtoEtJ+XzcAwyvM29Hn4cRhH5lfQhdiO/ISQXurlcOX8ADhFHGVgBdo3ldgRVx15an37979ME9EngIMMLDJq2j5YOAhXHXmaf3YnfgDaJrZ2nzk/rZqB1344uEvl8PAccm7fMBlAF/zvR5AGYCp9W3XlNeOtLPoWhs4AHAC8Ce7v5OtOhdYM+Yysq3m4ALCQ32IAym8zcPg+tA/YPwFKtyoAa4OzrddaeZ7UpCPxvuvg64AXiLMNbGBmAJyf181NrR5yGbAawaTKGfI2a2G3AfcJ7XGSTGw2666G+TMrPjgfc9jJ0g4Wj2QOAOdx8AfEKdUzlJ+WwAROeqRxF2hnsBu/L5Ux2Jlo/Pg0I/B8ysDSHw73X3+6PZ75lZ12h5V+D9uOrLo8OAkWb2JmEs5SMJ57T3iAbXgfoH4SlW1UC1u78QTS8g7ASS+NkAGAa84e417v4pcD/hM5PUz0etHX0eshnAqsEU+k0UDQD/X8Byd5+etih9YJkxhHP9Rc3dL3b37u5eRrhAt8jdvw08QRhcBxLyXgC4+7vAWjPrG806ClhGAj8bkbeAQ8ysQ/T/pvb9SOTnI82OPg+VwHeiu3gOATaknQZqND2c1URmNhh4BvgTn53Hnkw4rz8P6EnoGnqKJ2gAGTM7ArjA3Y+PRk2bS7iA9xJwhrtvirO+fDGzA4A7gbaEUeO+SzjYSuRnw8x+CpxKuOvtJeD7hPPUifh8mNkc4AhCN833gMuB31HP5yHaMd5GOAX2D+C77t7kwUYU+iIiCaLTOyIiCaLQFxFJEIW+iEiCKPRFRBJEoS8ikiAKfRGRBFHoi4gkiEJfRCRB/h/hhDU+lB6etwAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 绘制不同聚类数目K下模型的轮廓系数，找到最佳模型／参数（分数最高）\n",
    "plt.plot(Ks, np.array(CH_scores), 'b-')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "按照轮廓系数（Silhouette Coefficient）评价指标，在搜索范围K=10～100内，最佳的K取值为10。由于10位于搜索范围的下边界，更小的K也值得进一步尝试。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
